title: “Beers_Analysis” author: “Sachin Chavan” date: “10/13/2019” output: html_document —

Load follwing libraries

Load data into memory from csv

breweries_ds <- read.csv('data/Breweries.csv',header = TRUE)
beers_ds     <- read.csv('data/Beers.csv'    ,header = TRUE)
bmerged_ds   <- merge(breweries_ds,beers_ds,by.x="Brew_ID",by.y="Brewery_id")
names(bmerged_ds)[2] <- "BreweryName"
names(bmerged_ds)[5] <- "BeerName"

Structures of the datasets

str(breweries_ds)
## 'data.frame':    558 obs. of  4 variables:
##  $ Brew_ID: int  1 2 3 4 5 6 7 8 9 10 ...
##  $ Name   : Factor w/ 551 levels "10 Barrel Brewing Company",..: 355 12 266 319 201 136 227 477 59 491 ...
##  $ City   : Factor w/ 384 levels "Abingdon","Abita Springs",..: 228 200 122 299 300 62 91 48 152 136 ...
##  $ State  : Factor w/ 51 levels " AK"," AL"," AR",..: 24 18 20 5 5 41 6 23 23 23 ...
str(beers_ds)
## 'data.frame':    2410 obs. of  7 variables:
##  $ Name      : Factor w/ 2305 levels "#001 Golden Amber Lager",..: 1638 577 1704 1842 1819 268 1160 758 1093 486 ...
##  $ Beer_ID   : int  1436 2265 2264 2263 2262 2261 2260 2259 2258 2131 ...
##  $ ABV       : num  0.05 0.066 0.071 0.09 0.075 0.077 0.045 0.065 0.055 0.086 ...
##  $ IBU       : int  NA NA NA NA NA NA NA NA NA NA ...
##  $ Brewery_id: int  409 178 178 178 178 178 178 178 178 178 ...
##  $ Style     : Factor w/ 100 levels "","Abbey Single Ale",..: 19 18 16 12 16 80 18 22 18 12 ...
##  $ Ounces    : num  12 12 12 12 12 12 12 12 12 12 ...
str(bmerged_ds)
## 'data.frame':    2410 obs. of  10 variables:
##  $ Brew_ID    : int  1 1 1 1 1 1 2 2 2 2 ...
##  $ BreweryName: Factor w/ 551 levels "10 Barrel Brewing Company",..: 355 355 355 355 355 355 12 12 12 12 ...
##  $ City       : Factor w/ 384 levels "Abingdon","Abita Springs",..: 228 228 228 228 228 228 200 200 200 200 ...
##  $ State      : Factor w/ 51 levels " AK"," AL"," AR",..: 24 24 24 24 24 24 18 18 18 18 ...
##  $ BeerName   : Factor w/ 2305 levels "#001 Golden Amber Lager",..: 1640 1926 1525 802 1258 2185 71 458 1218 43 ...
##  $ Beer_ID    : int  2689 2688 2687 2692 2691 2690 2683 2686 2685 2684 ...
##  $ ABV        : num  0.06 0.06 0.056 0.045 0.049 0.048 0.042 0.08 0.125 0.077 ...
##  $ IBU        : int  38 25 47 50 26 19 42 68 80 25 ...
##  $ Style      : Factor w/ 100 levels "","Abbey Single Ale",..: 83 22 57 16 77 48 18 12 46 77 ...
##  $ Ounces     : num  16 16 16 16 16 16 16 16 16 16 ...

Merged dataset

head(bmerged_ds)
##   Brew_ID        BreweryName        City State      BeerName Beer_ID   ABV
## 1       1 NorthGate Brewing  Minneapolis    MN       Pumpion    2689 0.060
## 2       1 NorthGate Brewing  Minneapolis    MN    Stronghold    2688 0.060
## 3       1 NorthGate Brewing  Minneapolis    MN   Parapet ESB    2687 0.056
## 4       1 NorthGate Brewing  Minneapolis    MN  Get Together    2692 0.045
## 5       1 NorthGate Brewing  Minneapolis    MN Maggie's Leap    2691 0.049
## 6       1 NorthGate Brewing  Minneapolis    MN    Wall's End    2690 0.048
##   IBU                               Style Ounces
## 1  38                         Pumpkin Ale     16
## 2  25                     American Porter     16
## 3  47 Extra Special / Strong Bitter (ESB)     16
## 4  50                        American IPA     16
## 5  26                  Milk / Sweet Stout     16
## 6  19                   English Brown Ale     16
tail(bmerged_ds)
##      Brew_ID                   BreweryName          City State
## 2405     556         Ukiah Brewing Company         Ukiah    CA
## 2406     557       Butternuts Beer and Ale Garrattsville    NY
## 2407     557       Butternuts Beer and Ale Garrattsville    NY
## 2408     557       Butternuts Beer and Ale Garrattsville    NY
## 2409     557       Butternuts Beer and Ale Garrattsville    NY
## 2410     558 Sleeping Lady Brewing Company     Anchorage    AK
##                       BeerName Beer_ID   ABV IBU                   Style
## 2405             Pilsner Ukiah      98 0.055  NA         German Pilsener
## 2406         Porkslap Pale Ale      49 0.043  NA American Pale Ale (APA)
## 2407           Snapperhead IPA      51 0.068  NA            American IPA
## 2408         Moo Thunder Stout      50 0.049  NA      Milk / Sweet Stout
## 2409  Heinnieweisse Weissebier      52 0.049  NA              Hefeweizen
## 2410 Urban Wilderness Pale Ale      30 0.049  NA        English Pale Ale
##      Ounces
## 2405     12
## 2406     12
## 2407     12
## 2408     12
## 2409     12
## 2410     12

Summary Statistics of both datasets

summary(breweries_ds)
##     Brew_ID                           Name           City    
##  Min.   :  1.0   Blackrocks Brewery     :  2   Portland: 17  
##  1st Qu.:140.2   Blue Mountain Brewery  :  2   Boulder :  9  
##  Median :279.5   Lucette Brewing Company:  2   Chicago :  9  
##  Mean   :279.5   Oskar Blues Brewery    :  2   Seattle :  9  
##  3rd Qu.:418.8   Otter Creek Brewing    :  2   Austin  :  8  
##  Max.   :558.0   Sly Fox Brewing Company:  2   Denver  :  8  
##                  (Other)                :546   (Other) :498  
##      State    
##   CO    : 47  
##   CA    : 39  
##   MI    : 32  
##   OR    : 29  
##   TX    : 28  
##   PA    : 25  
##  (Other):358
summary(beers_ds)
##                      Name         Beer_ID            ABV         
##  Nonstop Hef Hop       :  12   Min.   :   1.0   Min.   :0.00100  
##  Dale's Pale Ale       :   6   1st Qu.: 808.2   1st Qu.:0.05000  
##  Oktoberfest           :   6   Median :1453.5   Median :0.05600  
##  Longboard Island Lager:   4   Mean   :1431.1   Mean   :0.05977  
##  1327 Pod's ESB        :   3   3rd Qu.:2075.8   3rd Qu.:0.06700  
##  Boston Lager          :   3   Max.   :2692.0   Max.   :0.12800  
##  (Other)               :2376                    NA's   :62       
##       IBU           Brewery_id                               Style     
##  Min.   :  4.00   Min.   :  1.0   American IPA                  : 424  
##  1st Qu.: 21.00   1st Qu.: 94.0   American Pale Ale (APA)       : 245  
##  Median : 35.00   Median :206.0   American Amber / Red Ale      : 133  
##  Mean   : 42.71   Mean   :232.7   American Blonde Ale           : 108  
##  3rd Qu.: 64.00   3rd Qu.:367.0   American Double / Imperial IPA: 105  
##  Max.   :138.00   Max.   :558.0   American Pale Wheat Ale       :  97  
##  NA's   :1005                     (Other)                       :1298  
##      Ounces     
##  Min.   : 8.40  
##  1st Qu.:12.00  
##  Median :12.00  
##  Mean   :13.59  
##  3rd Qu.:16.00  
##  Max.   :32.00  
## 
summary(bmerged_ds)
##     Brew_ID                          BreweryName             City     
##  Min.   :  1.0   Brewery Vivant            :  62   Grand Rapids:  66  
##  1st Qu.: 94.0   Oskar Blues Brewery       :  46   Portland    :  64  
##  Median :206.0   Sun King Brewing Company  :  38   Chicago     :  55  
##  Mean   :232.7   Cigar City Brewing Company:  25   Indianapolis:  43  
##  3rd Qu.:367.0   Sixpoint Craft Ales       :  24   San Diego   :  42  
##  Max.   :558.0   Hopworks Urban Brewery    :  23   Boulder     :  41  
##                  (Other)                   :2192   (Other)     :2099  
##      State                        BeerName       Beer_ID      
##   CO    : 265   Nonstop Hef Hop       :  12   Min.   :   1.0  
##   CA    : 183   Dale's Pale Ale       :   6   1st Qu.: 808.2  
##   MI    : 162   Oktoberfest           :   6   Median :1453.5  
##   IN    : 139   Longboard Island Lager:   4   Mean   :1431.1  
##   TX    : 130   1327 Pod's ESB        :   3   3rd Qu.:2075.8  
##   OR    : 125   Boston Lager          :   3   Max.   :2692.0  
##  (Other):1406   (Other)               :2376                   
##       ABV               IBU                                    Style     
##  Min.   :0.00100   Min.   :  4.00   American IPA                  : 424  
##  1st Qu.:0.05000   1st Qu.: 21.00   American Pale Ale (APA)       : 245  
##  Median :0.05600   Median : 35.00   American Amber / Red Ale      : 133  
##  Mean   :0.05977   Mean   : 42.71   American Blonde Ale           : 108  
##  3rd Qu.:0.06700   3rd Qu.: 64.00   American Double / Imperial IPA: 105  
##  Max.   :0.12800   Max.   :138.00   American Pale Wheat Ale       :  97  
##  NA's   :62        NA's   :1005     (Other)                       :1298  
##      Ounces     
##  Min.   : 8.40  
##  1st Qu.:12.00  
##  Median :12.00  
##  Mean   :13.59  
##  3rd Qu.:16.00  
##  Max.   :32.00  
## 

Visual Summary

Total Breweries per state

b_states <- breweries_ds %>% group_by(State) %>% count() %>% arrange(desc(n))
b_states$State <- factor(b_states$State, levels = b_states$State) 

ggplot(b_states, aes(x=State, y=n)) + 
  geom_bar(stat="identity", width=.8, fill="tomato3") + 
  labs(title="Ordered Bar Chart", 
       subtitle="State Vs Number of Breweries", 
       caption="Beer Analysis") + 
  xlab("State")+
  ylab("Total Breweries")+
  theme_wsj()+
  theme(axis.text.x = element_text(angle=65, vjust=0.6))+  
  theme(plot.title    = element_text(size = rel(0.5)),
  plot.subtitle = element_text(size = rel(0.5)),
  axis.text.x   = element_text(vjust=0.6,size=10),
  axis.title    = element_text(size = rel(0.5)),
  legend.position  = "right",
  legend.direction ="vertical",
  legend.title = element_text(size = rel(0.2)))

Missing Values

vis_dat(bmerged_ds,palette = "qual")

beer_ounces <- beers_ds
beer_ounces$Ounces <- as.factor(beer_ounces$Ounces)

gg_miss_var(bmerged_ds,facet=State)+
  theme_wsj()+
  theme(axis.text.x = element_text(angle=65, vjust=0.3))+  
  theme(plot.title    = element_text(size = rel(0.5)),
  plot.subtitle = element_text(size = rel(0.5)),
  axis.text.x   = element_text(vjust=0.6,size=8),
  axis.text.y   = element_text(vjust=0.2,size=4),
  axis.title    = element_text(size = rel(0.5)),
  legend.position  = "right",
  legend.direction ="vertical",
  legend.title = element_text(size = rel(0.2)))

gg_miss_var(bmerged_ds,facet=Style)+
  theme_wsj()+
  theme(axis.text.x = element_text(angle=65, vjust=0.3))+  
  theme(plot.title    = element_text(size = rel(0.5)),
  plot.subtitle = element_text(size = rel(0.3)),
  axis.text.x   = element_text(vjust=0.6,size=8),
  axis.text.y   = element_text(vjust=0.2,size=4),
  axis.title    = element_text(size = rel(0.5)),
  legend.position  = "right",
  legend.direction ="vertical",
  legend.title = element_text(size = rel(0.2)))

gg_miss_var(beer_ounces,facet=Ounces)+
  theme_wsj()+
  theme(axis.text.x = element_text(angle=65, vjust=0.3))+  
  theme(plot.title    = element_text(size = rel(0.5)),
  plot.subtitle = element_text(size = rel(0.5)),
  axis.text.x   = element_text(vjust=0.6,size=10),
  axis.text.y   = element_text(vjust=0.2,size=7),
  axis.title    = element_text(size = rel(0.5)),
  legend.position  = "right",
  legend.direction ="vertical",
  legend.title = element_text(size = rel(0.2)))

### Missing Values continued * Missing values by state * Missing values by Style

aq_shadow <- bind_shadow(bmerged_ds)
glimpse(aq_shadow)
## Observations: 2,410
## Variables: 20
## $ Brew_ID        <int> 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2…
## $ BreweryName    <fct> NorthGate Brewing , NorthGate Brewing , NorthGate…
## $ City           <fct> Minneapolis, Minneapolis, Minneapolis, Minneapoli…
## $ State          <fct>  MN,  MN,  MN,  MN,  MN,  MN,  KY,  KY,  KY,  KY,…
## $ BeerName       <fct> Pumpion, Stronghold, Parapet ESB, Get Together, M…
## $ Beer_ID        <int> 2689, 2688, 2687, 2692, 2691, 2690, 2683, 2686, 2…
## $ ABV            <dbl> 0.060, 0.060, 0.056, 0.045, 0.049, 0.048, 0.042, …
## $ IBU            <int> 38, 25, 47, 50, 26, 19, 42, 68, 80, 25, 17, 25, 2…
## $ Style          <fct> Pumpkin Ale, American Porter, Extra Special / Str…
## $ Ounces         <dbl> 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 1…
## $ Brew_ID_NA     <fct> !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA,…
## $ BreweryName_NA <fct> !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA,…
## $ City_NA        <fct> !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA,…
## $ State_NA       <fct> !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA,…
## $ BeerName_NA    <fct> !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA,…
## $ Beer_ID_NA     <fct> !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA,…
## $ ABV_NA         <fct> !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA,…
## $ IBU_NA         <fct> !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA,…
## $ Style_NA       <fct> !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA,…
## $ Ounces_NA      <fct> !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA,…
bmerged_ds %>%
  bind_shadow() %>%
  group_by(IBU_NA) %>%
  summarise_at(.vars = "ABV",
               .funs = c("mean", "sd", "var", "min", "max"),
               na.rm = TRUE)
## # A tibble: 2 x 6
##   IBU_NA   mean     sd      var   min   max
##   <fct>   <dbl>  <dbl>    <dbl> <dbl> <dbl>
## 1 !NA    0.0599 0.0136 0.000184 0.027 0.125
## 2 NA     0.0596 0.0135 0.000182 0.001 0.128
ggplot(aq_shadow,
       aes(x = ABV,
           colour = IBU_NA)) + 
  geom_density()
## Warning: Removed 62 rows containing non-finite values (stat_density).

ggplot(bmerged_ds, 
       aes(x = ABV, 
           y = IBU)) + 
  geom_miss_point() + 
  theme_wsj()+
  theme(axis.text.x = element_text(angle=65, vjust=0.3))+  
  theme(plot.title    = element_text(size = rel(0.5)),
  plot.subtitle = element_text(size = rel(0.5)),
  axis.text.x   = element_text(vjust=0.6,size=8),
  axis.text.y   = element_text(vjust=0.2,size=8),
  axis.title    = element_text(size = rel(0.5)),
  legend.position  = "right",
  legend.direction ="vertical",
  legend.title = element_text(size = rel(0.5)))+
  facet_wrap(~State)

ggplot(bmerged_ds, 
       aes(x = ABV, 
           y = IBU)) + 
  geom_miss_point() + 
  theme_wsj()+
  theme(axis.text.x = element_text(angle=65, vjust=0.3))+  
  theme(plot.title    = element_text(size = rel(0.3)),
  plot.subtitle = element_text(size = rel(0.5)),
  axis.text.x   = element_text(vjust=0.6,size=8),
  axis.text.y   = element_text(vjust=0.2,size=6),
  axis.title    = element_text(size = rel(0.5)),
  legend.position  = "right",
  legend.direction ="vertical",
  legend.title = element_text(size = rel(0.5)))+
  facet_wrap(~Style)

Imputations

bmerged_final_ds <- bmerged_ds %>% filter(!is.na(ABV) | !is.na(IBU))

non_missing_ibu <- bmerged_final_ds %>% filter(!is.na(IBU))
non_missing_ibu$imputed <- "N"

missing_ibu <- bmerged_final_ds %>% filter(is.na(IBU))
missing_ibu$imputed <- "Y"

model_ibu <- non_missing_ibu %>% lm(formula=log(IBU)~log(ABV))
plot(model_ibu)

summary(model_ibu)
## 
## Call:
## lm(formula = log(IBU) ~ log(ABV), data = .)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.81374 -0.30156  0.08312  0.38514  1.35081 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  8.99667    0.18115   49.66   <2e-16 ***
## log(ABV)     1.91683    0.06363   30.12   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.514 on 1403 degrees of freedom
## Multiple R-squared:  0.3927, Adjusted R-squared:  0.3923 
## F-statistic: 907.4 on 1 and 1403 DF,  p-value: < 2.2e-16
missing_ibu$IBU <- round(exp(predict(model_ibu,missing_ibu)),0)

imputed_df <- rbind(non_missing_ibu,missing_ibu)
imputed_df$imputed <- as.factor(imputed_df$imputed)

imputed_df %>% ggplot(aes(x=ABV,y=IBU,color=imputed))+
               geom_point()+ggtitle("ABV Vs IBU") + 
               theme_wsj()+
               theme(axis.text.x = element_text(angle=65, vjust=0.3))+  
               theme(plot.title    = element_text(size = rel(0.5)),
               plot.subtitle = element_text(size = rel(0.5)),
               axis.text.x   = element_text(vjust=0.6,size=8),
               axis.text.y   = element_text(vjust=0.2,size=6),
               axis.title    = element_text(size = rel(0.5)),
               legend.position  = "right",
               legend.direction ="vertical",
               legend.title = element_text(size = rel(0.5)))

Median Alcohol content and Bitternes Vs state

abv_by_states <- imputed_df %>% select(State,ABV) %>% group_by(State) %>%  summarise(Median=as.numeric(median(ABV))) %>% arrange(desc(Median))

abv_by_states$State <- factor(abv_by_states$State, levels = abv_by_states$State) 

ggplot(abv_by_states, aes(x=State, y=Median)) + 
  geom_bar(stat="identity", width=.8, fill="tomato3") + 
  labs(title="Ordered Bar Chart", 
       subtitle="Median ABV Vs State", 
       caption="Beer Analysis") + 
  xlab("State")+
  ylab("Alcohol content")+
  theme_wsj()+
  theme(axis.text.x = element_text(angle=65, vjust=0.6))+  
  theme(plot.title    = element_text(size = rel(0.5)),
  plot.subtitle = element_text(size = rel(0.5)),
  axis.text.x   = element_text(vjust=0.6,size=7),
  axis.title    = element_text(size = rel(0.5)),
  legend.position  = "right",
  legend.direction ="vertical",
  legend.title = element_text(size = rel(0.2)))

ibu_by_states <- imputed_df %>% select(State,IBU) %>% group_by(State) %>%  summarise(Median=as.numeric(median(IBU))) %>% arrange(desc(Median))

ibu_by_states$State <- factor(ibu_by_states$State, levels = ibu_by_states$State) 

ggplot(ibu_by_states, aes(x=State, y=Median)) + 
  geom_bar(stat="identity", width=.8, fill="tomato3") + 
  labs(title="Ordered Bar Chart", 
       subtitle="Median IBU Vs State", 
       caption="Beer Analysis") + 
  xlab("State")+
  ylab("Alcohol content")+
  theme_wsj()+
  theme(axis.text.x = element_text(angle=65, vjust=0.6))+  
  theme(plot.title    = element_text(size = rel(0.5)),
  plot.subtitle = element_text(size = rel(0.5)),
  axis.text.x   = element_text(vjust=0.6,size=7),
  axis.title    = element_text(size = rel(0.5)),
  legend.position  = "right",
  legend.direction ="vertical",
  legend.title = element_text(size = rel(0.2)))

Distribution of ABV

imputed_df %>% ggplot(aes(y=ABV,x=State)) +
               geom_boxplot(aes(fill=State), alpha=0.8,show.legend = FALSE) + 
               labs(title="Alcohol Content", 
               caption="Beer Analysis") + 
               xlab("State")+
               ylab("Alcohol content")+
               theme_wsj()+
               theme(axis.text.x = element_text(angle=65, vjust=0.6))+  
               theme(plot.title    = element_text(size = rel(0.5)),
               plot.subtitle = element_text(size = rel(0.5)),
               axis.text.x   = element_text(vjust=0.6,size=7),
               axis.title    = element_text(size = rel(0.5)),
               legend.position  = "right",
               legend.direction ="vertical",
               legend.title = element_text(size = rel(0.2)))

imputed_df %>% ggplot(aes(x=ABV)) +
               geom_density(aes(fill=State), alpha=0.8) + 
               labs(title="Density Plot", 
               caption="Beer Analysis") + 
               xlab("State")+
               ylab("Alcohol content")+
               theme_wsj()+
               theme(axis.text.x = element_text(angle=65, vjust=0.6))+  
               theme(plot.title    = element_text(size = rel(0.5)),
               plot.subtitle = element_text(size = rel(0.5)),
               axis.text.x   = element_text(vjust=0.6,size=7),
               axis.title    = element_text(size = rel(0.5)),
               legend.position  = "right",
               legend.direction ="vertical",
               legend.title = element_text(size = rel(0.2)))
## Warning: Groups with fewer than two data points have been dropped.